from IPython.display import Image
# Render a remote image inline; judging by its file name, the URL points to a
# Chicago housing-stock graphic.
Image(url="https://chicagoagentmagazine.com/wp-content/uploads/2017/01/chicago-housing-stock-zillow-valuable-gains-2016-2015-affordability.png")
For our data, we used the Apify web scraper to collect all Airbnb listings in and around the city of Chicago. We then wrote the results to a CSV file to use throughout the project.
We also used a dataset that provided us with a safety score and median house price for each neighborhood.
The following code picks up after the initial listings CSV had been created.
import pandas as pd

# Initial dataset of all of our listings
df = pd.read_csv("listings.csv")
# Forward-fill missing values: each gap takes the value from the row above.
# DataFrame.ffill() is the supported spelling of fillna(method="ffill"),
# which pandas has deprecated.
df = df.ffill()

# Getting median, mean, and count of listings grouped by neighborhood,
# plus the median coordinates and the mean of number_of_reviews_ltm.
df_median = (
    df.groupby('neighbourhood')
    .agg(
        avg_price=('price', 'mean'),
        median_price=('price', 'median'),
        count=('price', 'count'),
        latitude=('latitude', 'median'),
        longitude=('longitude', 'median'),
        number_of_reviews_ltm=("number_of_reviews_ltm", 'mean'),
    )
    .reset_index()
)

# Sort by median, highest first
df_median = df_median.sort_values(by="median_price", ascending=False)
Creating a map that shows each neighborhood in Chicago along with its average Airbnb price per night. The color is scaled to show variation in price.
#! pip install plotly==5.11.0
import plotly.express as px
import pandas as pd
import io

df = pd.read_csv("median_location.csv")
# Forward-fill missing values; DataFrame.ffill() is the supported spelling of
# the deprecated fillna(method="ffill").
df = df.ffill()

# Drop rows priced above 300 (the upper bound of range_color below), then keep
# at most the first 1000 of the remaining rows. The cap has to be taken from
# the filtered frame (df2); taking it from df would discard the price filter.
df2 = df.drop(df.index[df['price'] > 300])
df2 = df2.head(1000)

# NOTE(review): the access token is hard-coded in the notebook; consider
# loading it from an environment variable or config file instead.
mapbox_api_key = 'pk.eyJ1IjoiZ3JhY2VvbnBlZGFscyIsImEiOiJjbGI5cHBseGMwMXU1M3BudGlseGwzdGl3In0.K8iL6PXjXLvM5jqWrdp_dA'
px.set_mapbox_access_token(mapbox_api_key)

# One marker per row: both color and size encode the price column.
fig = px.scatter_mapbox(
    df2,
    lat="latitude",
    lon="longitude",
    color="price",
    range_color=[0, 300],
    size="price",
    color_continuous_scale=px.colors.sequential.Rainbow,
    hover_name="neighbourhood",
    zoom=10,
)
fig.update_layout(
    height=1200
)
fig.show()
Grouping the airbnb listings by neighborhood and finding median price per night.
Combining median house price and safety scores of each neighborhood in Chicago.
Creating a new column called payback period, which shows how many days a newly purchased Airbnb property would need to be rented out to pay back the full price of the house.
import pandas as pd

df = pd.read_csv("listings.csv")
# Forward-fill missing values; DataFrame.ffill() is the supported spelling of
# the deprecated fillna(method="ffill").
df = df.ffill()

# Gets the per-neighbourhood median of location, price, minimum nights, and
# number of reviews. The columns are selected before aggregating so that
# median() only sees the columns we need, which avoids the numeric_only
# FutureWarning raised when non-numeric columns are present.
median_columns = ['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews']
median = df.groupby('neighbourhood')[median_columns].median()
median['price']
/tmp/ipykernel_51/2558230421.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
neighbourhood
Albany Park 119.0
Archer Heights 79.5
Armour Square 115.5
Ashburn 150.0
Auburn Gresham 119.0
...
West Lawn 57.0
West Pullman 125.0
West Ridge 102.0
West Town 176.0
Woodlawn 133.0
Name: price, Length: 77, dtype: float64
# Per-neighbourhood price summary: mean price, median price, and the number of
# non-null price observations.
mean = (
    df.groupby('neighbourhood')
    .agg(
        avg_price=pd.NamedAgg(column='price', aggfunc='mean'),
        median_price=pd.NamedAgg(column='price', aggfunc='median'),
        count=pd.NamedAgg(column='price', aggfunc='count'),
    )
    .reset_index()
)
# Display the summary ordered from highest to lowest median price.
mean.sort_values(by="median_price", ascending=False)
| neighbourhood | avg_price | median_price | count | |
|---|---|---|---|---|
| 63 | South Deering | 384.875000 | 489.0 | 8 |
| 16 | Clearing | 359.214286 | 399.0 | 14 |
| 48 | Near South Side | 310.574468 | 290.5 | 188 |
| 41 | Loop | 266.280220 | 208.0 | 364 |
| 38 | Lincoln Park | 269.978571 | 203.0 | 280 |
| ... | ... | ... | ... | ... |
| 43 | Mckinley Park | 124.255814 | 51.0 | 43 |
| 70 | West Englewood | 53.500000 | 39.5 | 8 |
| 50 | New City | 58.600000 | 39.0 | 30 |
| 62 | South Chicago | 82.407407 | 25.0 | 27 |
| 13 | Calumet Heights | 57.433333 | 25.0 | 30 |
77 rows × 4 columns
import pandas as pd

safe_house = pd.read_csv("SAFETY_HOUSESALE.csv")
df = pd.read_csv("listings.csv")

# Summarise the listings per neighbourhood (mean/median price, count, median
# coordinates) and order the summary by median price, highest first.
mean = (
    df.groupby('neighbourhood')
    .agg(
        avg_price=('price', 'mean'),
        median_price=('price', 'median'),
        count=('price', 'count'),
        latitude=('latitude', 'median'),
        longitude=('longitude', 'median'),
    )
    .reset_index()
    .sort_values(by="median_price", ascending=False)
)

# Index both tables by neighbourhood name and keep only the names present in both.
safety_by_name = safe_house.set_index('Neighborhood')
prices_by_name = mean.set_index('neighbourhood')
combined = safety_by_name.join(prices_by_name, how='inner')
combined.columns
Index(['Safety_Score', 'Area', 'Median_sale_price', 'avg_price',
'median_price', 'count', 'latitude', 'longitude'],
dtype='object')
# Rebind df to a DataFrame built from the joined table.
df = pd.DataFrame(combined)

# Build the working table from the joined columns, in a fixed order.
final_columns = [
    'Safety_Score', 'Area', 'Median_sale_price', 'avg_price',
    'median_price', 'count', "latitude", 'longitude',
]
finaldata = pd.DataFrame(combined, columns=final_columns)

# payback_period = median sale price / median listing price, rounded to a
# whole number.
price_ratio = finaldata['Median_sale_price'] / finaldata["median_price"]
finaldata['payback_period'] = price_ratio.round(0)
finaldata['avg_price'] = finaldata['avg_price'].round(0)

# Discard rows that still contain a missing value, then display the table.
finaldata = finaldata.dropna()
finaldata
| Safety_Score | Area | Median_sale_price | avg_price | median_price | count | latitude | longitude | payback_period | |
|---|---|---|---|---|---|---|---|---|---|
| Albany Park | 58.0 | Irving Park/Albany Park | 515000.0 | 138.0 | 119.0 | 64 | 41.966613 | -87.712714 | 4328.0 |
| Archer Heights | 68.0 | Bridgeport/Brighton Park | 296500.0 | 78.0 | 79.5 | 6 | 41.805606 | -87.727050 | 3730.0 |
| Armour Square | 32.0 | Bridgeport/Brighton Park | 296500.0 | 194.0 | 115.5 | 72 | 41.849599 | -87.633210 | 2567.0 |
| Ashburn | 63.0 | Beverly/Morgan Park | 250000.0 | 165.0 | 150.0 | 19 | 41.735360 | -87.703300 | 1667.0 |
| Auburn Gresham | 46.0 | Auburn Gresham/Chatham | 153500.0 | 168.0 | 119.0 | 13 | 41.746310 | -87.654830 | 1290.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| West Englewood | 46.0 | Englewood/Greater Grand Crossing | 130000.0 | 54.0 | 39.5 | 8 | 41.777566 | -87.661960 | 3291.0 |
| West Garfield Park | 23.0 | Humboldt Park/Garfield Park | 211500.0 | 84.0 | 86.0 | 26 | 41.874930 | -87.724105 | 2459.0 |
| West Ridge | 70.0 | Lincoln Square/North Center | 674000.0 | 132.0 | 102.0 | 125 | 42.014730 | -87.690860 | 6608.0 |
| West Town | 65.0 | West Town/Near West Side | 818250.0 | 299.0 | 176.0 | 721 | 41.900080 | -87.672044 | 4649.0 |
| Woodlawn | 52.0 | Bronzeville/Hyde Park | 320000.0 | 169.0 | 133.0 | 121 | 41.781541 | -87.606250 | 2406.0 |
72 rows × 9 columns
from ipywidgets import widgets, interactive, Layout
# Load the table from finaldata.csv; the widget and lookup cells that follow
# all read from this `df`.
df = pd.read_csv("finaldata.csv")
Below is a dropdown widget, which allows users to view any particular neighborhood they wish, with all as the default. It also provides a box plot of the median prices as well as safety score.
# Dropdown choices: the "All" entry first, then every distinct Area value in
# sorted order.
area_choices = ["All"]
area_choices.extend(sorted(set(df.Area)))

w_area = widgets.Dropdown(
    description='Area:',
    options=area_choices,
    value="All",
    style={"description_width": '50px'},
    layout=Layout(width="15%"),
)
def view(location):
    """Show a box plot and the matching rows for the chosen Area.

    ``location`` is either the literal "All" (no filtering) or an Area value;
    in the latter case only rows whose Area equals it are used. The box plot
    covers the median_price and Safety_Score columns, and the selected rows
    are displayed underneath.
    """
    selected = df if location == "All" else df[df.Area == location]
    title = "Median price of {} neighborhood".format(location)
    box_columns = selected[["median_price", 'Safety_Score']]
    box_columns.plot(kind="box", title=title, grid=True, figsize=(5, 5))
    display(selected)


# Re-run view() whenever the dropdown selection changes.
i = interactive(view, location=w_area)
display(i)
interactive(children=(Dropdown(description='Area:', layout=Layout(width='15%'), options=('All', 'Auburn Gresha…
# idxmin reports the index label of the row holding the smallest payback_period.
df[['payback_period']].idxmin() # Shows us row 60 has the smallest payback period
payback_period 60 dtype: int64
Index of most profitable neighborhood
print('Most profitable neighborhood')
# Look the row up by the computed index label of the smallest payback_period
# instead of a hard-coded position, so the result stays correct if the
# contents or ordering of finaldata.csv change.
df.loc[df['payback_period'].idxmin()]
Most profitable neighborhood
Neighborhood South Deering Safety_Score 68.0 Area South Chicago/West Pullman Median_sale_price 138500.0 avg_price 385.0 median_price 489.0 count 8 latitude 41.713493 longitude -87.562173 payback_period 283.0 Name: 60, dtype: object
The most profitable neighborhood looks to be South Deering, which is located in the South Chicago/West Pullman area.
# Index labels of the rows with the lowest and the highest Median_sale_price.
print(df[['Median_sale_price']].idxmin())
print(df[['Median_sale_price']].idxmax())
Median_sale_price 23 dtype: int64 Median_sale_price 37 dtype: int64
Index value of most and least expensive neighborhoods to purchase homes.
# Resolve both rows from the data (index labels of the highest and lowest
# Median_sale_price) rather than from hard-coded positions, so the printout
# stays correct if finaldata.csv changes.
print('\nMost expensive neighborhood\n')
print(df.loc[df['Median_sale_price'].idxmax()])
print('Cheapest Neighborhood\n')
print(df.loc[df['Median_sale_price'].idxmin()])
Most expensive neighborhood Neighborhood Lake View Safety_Score 74.0 Area Lake View/Lincoln Park Median_sale_price 1288500.0 avg_price 214.0 median_price 165.0 count 589 latitude 41.94416 longitude -87.6542 payback_period 7809.0 Name: 37, dtype: object Cheapest Neighborhood Neighborhood Englewood Safety_Score 45.0 Area Englewood/Greater Grand Crossing Median_sale_price 130000.0 avg_price 82.0 median_price 75.0 count 25 latitude 41.78866 longitude -87.63432 payback_period 1733.0 Name: 23, dtype: object
The most expensive neighborhood is Lake View, while the cheapest neighborhood is Englewood.
# Three pre-sorted views of the table, one per ranking offered by the buttons:
# highest safety score first, shortest payback period first, and lowest
# median price first.
safest = df.sort_values("Safety_Score", ascending=False)
fastestPayoff = df.sort_values("payback_period")
cheapestMedianPrice = df.sort_values("median_price")

# Toggle buttons for choosing which ranking to show; 'All' is preselected.
selection_settings = {
    "description": 'Select Desired Property:',
    "options": ['All', 'Safest', 'Fastest Pay-off', 'Cheapest Price'],
    "value": 'All',
    "style": {"description_width": '200px'},
    "layout": Layout(width="90%"),
}
selection = widgets.ToggleButtons(**selection_settings)
def view(choice):
    """Display the first ten rows of the table that matches the chosen button.

    "All" shows the unsorted table, "Fastest Pay-off" the table sorted by
    payback period, and 'Cheapest Price' the table sorted by median price.
    Any other value falls back to the table sorted by safety score.
    """
    tables = {
        "All": df,
        "Fastest Pay-off": fastestPayoff,
        'Cheapest Price': cheapestMedianPrice,
    }
    chosen = tables.get(choice, safest)
    display(chosen.head(10))


# Re-run view() whenever a different button is selected.
i = interactive(view, choice=selection)
display(i)
interactive(children=(ToggleButtons(description='Select Desired Property:', layout=Layout(width='90%'), option…
This widget shows options to display the top ten safest, fastest pay-off, or cheapest properties based on which button is selected.
def priority_visiting(dataframe, desiredprice, desiredsafety):
    """Return the neighborhoods that are both safe enough and cheap enough.

    Args:
        dataframe: DataFrame with ``Neighborhood``, ``median_price`` and
            ``Safety_Score`` columns.
        desiredprice: Highest acceptable median price (inclusive).
        desiredsafety: Lowest acceptable safety score (inclusive).

    Returns:
        A list of ``[neighborhood, median_price, safety_score]`` lists,
        sorted in ascending order (so primarily by neighborhood name).
        Empty when no row qualifies.
    """
    # Vectorized column comparisons replace the per-row iterrows() loop.
    safe_enough = dataframe["Safety_Score"] >= desiredsafety
    cheap_enough = dataframe["median_price"] <= desiredprice
    matches = dataframe.loc[
        safe_enough & cheap_enough,
        ["Neighborhood", "median_price", "Safety_Score"],
    ]
    return sorted(matches.values.tolist())
This function allows an Airbnb renter to enter a preferred safety score and nightly rental price in order to see the best neighborhood options.
# Example query: median price of at most 100 and a safety score of at least 90.
priority_visiting(df,100,90)
[['Beverly', 75.0, 100.0], ['Edison Park', 69.0, 92.0], ['Mount Greenwood', 94.0, 98.0], ['Norwood Park', 96.0, 100.0]]
Example above: Determine neighborhoods for someone who would like a rental price of at most $100 per day and a safety score of at least 90.
def priority_owning(dataframe, desiredpayback, desiredsafety):
    """Return the neighborhoods that pay back fast enough and are safe enough.

    Args:
        dataframe: DataFrame with ``Neighborhood``, ``Area``,
            ``payback_period`` and ``Safety_Score`` columns.
        desiredpayback: Longest acceptable payback period (inclusive).
        desiredsafety: Lowest acceptable safety score (inclusive).

    Returns:
        A list of ``[neighborhood, area, payback_period, safety_score]``
        lists, sorted in ascending order (so primarily by neighborhood
        name). Empty when no row qualifies.
    """
    # Vectorized column comparisons replace the per-row iterrows() loop.
    safe_enough = dataframe["Safety_Score"] >= desiredsafety
    fast_enough = dataframe["payback_period"] <= desiredpayback
    matches = dataframe.loc[
        safe_enough & fast_enough,
        ["Neighborhood", "Area", "payback_period", "Safety_Score"],
    ]
    return sorted(matches.values.tolist())
A user-defined function that allows an Airbnb owner to enter a preferred safety score and payback period in order to see the best neighborhood options.
# Shortlist for an owner: payback period of at most 1500 and a safety score of
# at least 75, printed as one labelled paragraph per neighborhood.
best = priority_owning(df, 1500, 75)
report_template = 'Neighborhood: {}\nArea: {}\nPayback Period(in days): {}\nSafety Score(out of 100): {}\n'
for neighborhood, area, payback, safety in best:
    print(report_template.format(neighborhood, area, payback, safety))
Neighborhood: Clearing Area: Gage Park/West Lawn Payback Period(in days): 689.0 Safety Score(out of 100): 76.0 Neighborhood: Garfield Ridge Area: Gage Park/West Lawn Payback Period(in days): 1455.0 Safety Score(out of 100): 80.0 Neighborhood: Hegewisch Area: South Chicago/West Pullman Payback Period(in days): 1473.0 Safety Score(out of 100): 76.0 Neighborhood: Pullman Area: South Chicago/West Pullman Payback Period(in days): 1030.0 Safety Score(out of 100): 77.0
Example above: Determining the best neighborhoods to invest in an Airbnb based on a payback period of at most 1500 days and a safety score of at least 75.
# Rows with a safety score of at least 75 and a payback period of at most 1500,
# kept as a DataFrame so they can be plotted. The former `best_mapping = []`
# placeholder was overwritten immediately and has been removed.
best_mapping = df[(df.Safety_Score >= 75) & (df.payback_period <= 1500)]
best_mapping
| Neighborhood | Safety_Score | Area | Median_sale_price | avg_price | median_price | count | latitude | longitude | payback_period | |
|---|---|---|---|---|---|---|---|---|---|---|
| 16 | Clearing | 76.0 | Gage Park/West Lawn | 275000.0 | 359.0 | 399.0 | 14 | 41.779615 | -87.773815 | 689.0 |
| 27 | Garfield Ridge | 80.0 | Gage Park/West Lawn | 275000.0 | 233.0 | 189.0 | 27 | 41.797160 | -87.785070 | 1455.0 |
| 30 | Hegewisch | 76.0 | South Chicago/West Pullman | 138500.0 | 90.0 | 94.0 | 14 | 41.649395 | -87.530065 | 1473.0 |
| 55 | Pullman | 77.0 | South Chicago/West Pullman | 138500.0 | 150.0 | 134.5 | 12 | 41.688997 | -87.607185 | 1030.0 |
Dataframe of neighborhoods for someone that would like to own an Airbnb property that has a payback period of 1500 or less and a safety score of 75 or better.
import plotly.express as px
import pandas as pd
import io

mapbox_api_key = 'pk.eyJ1IjoiZ3JhY2VvbnBlZGFscyIsImEiOiJjbGI5cHBseGMwMXU1M3BudGlseGwzdGl3In0.K8iL6PXjXLvM5jqWrdp_dA'
px.set_mapbox_access_token(mapbox_api_key)

# Plot the shortlisted rows: marker color encodes payback_period (color range
# fixed to 500-1500) and marker size encodes Safety_Score.
marker_settings = dict(
    lat='latitude',
    lon='longitude',
    color='payback_period',
    range_color=[500, 1500],
    size='Safety_Score',
    color_continuous_scale=px.colors.sequential.Rainbow,
    hover_name='Neighborhood',
    zoom=10,
)
fig = px.scatter_mapbox(best_mapping, **marker_settings)
fig.update_layout(height=1200)
fig.show()
Geographical representation of Airbnb properties with payback periods of 1500 or less and safety scores of 75 or better.
We chose Clearing because it has a low payback period, which means that we will be able to pay off our house quickly by renting it out on Airbnb. It also has a high safety score, which is important because renters prefer an area that is safe.
#!pip install -U matplotlib
import matplotlib.pyplot as plt
from matplotlib import image

# Read the picture from disk, report the type of the loaded object, and
# render it in the current figure.
clearing_picture = "Clearing_Illinois.png"
img = image.imread(clearing_picture)
print(type(img))
plt.imshow(img)
<class 'numpy.ndarray'>
<matplotlib.image.AxesImage at 0x7f04cdae9fa0>
Image of the Clearing neighborhood